{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ee8f59ca",
   "metadata": {},
   "source": [
    "### Filtering criteria for transcriptional readthrough SVs \n",
    "\n",
    "Filtering criteria:\n",
    "* SV is located upstream of misexpressed gene\n",
    "* SV overlaps 3' end of upstream gene. Upstream gene: \n",
    "    * On same strand of misexpressed gene \n",
    "    * Expressed in whole blood (median TPM > 0.5)\n",
    "    * SV overlaps a terminal exon polyA site \n",
    "* No intervening expressed genes between misexpressed gene and upstream gene\n",
    "    * Intervening genes are not overlapped by SV or misexpressed gene \n",
    "* Closest gene to SV "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c87c3ac9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from pybedtools import BedTool\n",
    "from io import StringIO\n",
    "import pysam\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e221b472",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "# input paths\n",
    "misexp_vrnt_feat_path = wkdir_path.joinpath(\"6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv\")\n",
    "tpm_mtx_path = wkdir_path.joinpath(\"1_rna_seq_qc/tpm_mtx/tpm_4568samples_59144genes_smpl_qc.csv\")\n",
    "polya_site_atlas_path = wkdir_path.joinpath(\"reference/polyA_site/atlas.clusters.2.0.GRCh38.96.tsv.gz\")\n",
    "polya_site_atlas_bed_path = wkdir_path.joinpath(\"reference/polyA_site/atlas.clusters.2.0.GRCh38.96.bed.gz\")\n",
    "vrnt_id_in_window_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_chr_num_misexp_genes.bed\")\n",
    "gencode_bed_path = wkdir_path.joinpath(\"3_misexp_genes/bed_files/all_genes.bed\")\n",
    "# output directory \n",
    "out_dir=wkdir_path.joinpath(\"6_misexp_dissect/tx_readthrough/deletions\")\n",
    "out_dir_path = Path(out_dir)\n",
    "out_dir_path.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3aa1000b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of misexpression-associated deletions: 87\n"
     ]
    }
   ],
   "source": [
    "# load misexpression-associated variants \n",
    "misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep=\"\\t\")\n",
    "# list of misexpression-associated DELs \n",
    "misexp_del_feat_df = misexp_vrnt_feat_df[misexp_vrnt_feat_df.SVTYPE == \"DEL\"]\n",
    "misexp_dels = misexp_del_feat_df.vrnt_id.unique()\n",
    "print(f\"Number of misexpression-associated deletions: {len(misexp_dels)}\")\n",
    "# load genes .bed file \n",
    "gencode_bed = BedTool(gencode_bed_path)\n",
    "# load gene expression TPM matrix \n",
    "tpm_mtx_df = pd.read_csv(tpm_mtx_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "465f66f1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of genes with median TPM > 0.5: 17418\n"
     ]
    }
   ],
   "source": [
    "# identify active genes in INTERVAL (median TPM > 0.5)\n",
    "tpm_cutoff = 0.5\n",
    "# median TPM of gene across INTERVAL \n",
    "median_tpm_df = pd.DataFrame(tpm_mtx_df.set_index(\"gene_id\").median(axis=1))\n",
    "median_tpm_df = median_tpm_df.rename(columns={0:\"median_tpm\"}).reset_index()\n",
    "genes_median_tpm05 = set(median_tpm_df[median_tpm_df.median_tpm > tpm_cutoff].gene_id.unique())\n",
    "print(f\"Number of genes with median TPM > {tpm_cutoff}: {len(genes_median_tpm05)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d9acbcb0",
   "metadata": {},
   "source": [
    "**Annotate DELs positioned upstream of misexpressed gene**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8a78c962",
   "metadata": {},
   "outputs": [],
   "source": [
    "vrnt_gene_dist_df_list = []\n",
    "misexp_del_gene_df = misexp_del_feat_df[[\"vrnt_id\", \"chrom\", \"sv_start\", \"sv_end\", \"gene_id\", \"gene_start\", \"gene_end\", \"gene_strand\"]].drop_duplicates()\n",
    "for index, row in misexp_del_gene_df.iterrows(): \n",
    "    # calculate shortest distance between SV and misexpressed gene \n",
    "    vrnt_id, chrom, sv_start, sv_end = row[\"vrnt_id\"], row[\"chrom\"], row[\"sv_start\"], row[\"sv_end\"], \n",
    "    gene_id, gene_start, gene_end, gene_strand = row[\"gene_id\"], row[\"gene_start\"], row[\"gene_end\"], row[\"gene_strand\"]\n",
    "    sv_bed = BedTool(f\"{chrom} {sv_start} {sv_end} {vrnt_id}\", from_string=True)\n",
    "    gene_bed = BedTool(f\"{chrom} {gene_start} {gene_end} {gene_id} 0 {gene_strand}\", from_string=True)\n",
    "    # -D uses negative distances to report closest upstream features\n",
    "    gene_bed_distance_str = StringIO(str(gene_bed.closest(sv_bed, D=\"a\")))\n",
    "    gene_bed_distance_df = pd.read_csv(gene_bed_distance_str, sep=\"\\t\", header=None)\n",
    "    vrnt_gene_dist_df_list.append(gene_bed_distance_df)\n",
    "vrnt_gene_dist_cols = {0:\"gene_chrom\", 1:\"gene_start\", 2:\"gene_end\", 3:\"misexp_gene_id\", \n",
    "                       4:\"score\", 5:\"gene_strand\", 6:\"sv_chrom\", 7:\"sv_start\", 8:\"sv_end\", \n",
    "                       9:\"vrnt_id\", 10:\"distance\"}\n",
    "vrnt_gene_dist_df = pd.concat(vrnt_gene_dist_df_list).rename(columns=vrnt_gene_dist_cols)\n",
    "vrnt_gene_dist_df['vrnt_id'] = vrnt_gene_dist_df['vrnt_id'].astype(str)\n",
    "if misexp_del_gene_df.shape[0] != vrnt_gene_dist_df.shape[0]: \n",
    "    raise ValueError(\"Variant-gene pair number in input does not match output.\")\n",
    "# select variants upstream \n",
    "vrnt_id_upstream_df = vrnt_gene_dist_df[vrnt_gene_dist_df.distance < 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2c7eb991",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of deletions upstream of misexpressed genes: 54\n",
      "Upstream variants identical: True\n"
     ]
    }
   ],
   "source": [
    "vrnt_id_upstream = set(vrnt_id_upstream_df.vrnt_id.unique())\n",
    "print(f\"Number of deletions upstream of misexpressed genes: {len(vrnt_id_upstream)}\")\n",
    "# check against custom annotation method\n",
    "vrnt_id_upstream_feat_df = misexp_del_feat_df[misexp_del_feat_df.position == \"Upstream\"]\n",
    "vrnt_id_upstream_feat = set(vrnt_id_upstream_feat_df.vrnt_id.unique())\n",
    "print(f\"Upstream variants identical: {vrnt_id_upstream==vrnt_id_upstream_feat}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "90b570d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# subset to relevant variant features and upstream DELs \n",
    "misexp_del_sv_feat_df = misexp_del_feat_df[[\"vrnt_id\", \"chrom\", \"sv_start\", \"sv_end\", \"gene_id\", \"gene_start\", \"gene_end\", \"gene_strand\", \"position\"]].drop_duplicates()\n",
    "misexp_gene_cols = {\"gene_id\": \"misexp_gene_id\", \"gene_start\": \"misexp_gene_start\", \n",
    "                    \"gene_end\": \"misexp_gene_end\", \"gene_strand\": \"misexp_gene_strand\"}\n",
    "misexp_del_sv_feat_df = misexp_del_sv_feat_df.rename(columns=misexp_gene_cols)\n",
    "# subset to SVs positioned upstream \n",
    "misexp_del_upstream_df = misexp_del_sv_feat_df[misexp_del_sv_feat_df.position == \"Upstream\"].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "28d9d1d8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# add distance between upstream SV and misexpressed gene \n",
    "del_upstream_vrnt_gene_distance_df = vrnt_id_upstream_df[[\"vrnt_id\", \"misexp_gene_id\", \"distance\"]].drop_duplicates()\n",
    "misexp_del_sv_feat_upstream_df = pd.merge(misexp_del_upstream_df, \n",
    "                                          del_upstream_vrnt_gene_distance_df, \n",
    "                                          on=[\"vrnt_id\", \"misexp_gene_id\"], \n",
    "                                          how=\"left\")\n",
    "# check dimensions do not change \n",
    "len(misexp_del_sv_feat_upstream_df) == len(misexp_del_upstream_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "744b1fb0",
   "metadata": {},
   "source": [
    "**Annotate upstream DELs that have no active genes on the same strand as the misexpressed gene in the readthrough region**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b78d2694",
   "metadata": {},
   "outputs": [],
   "source": [
    "def no_active_intervening_gene(row):\n",
    "    \"\"\" \n",
    "    Check if there is an active gene on the same strand as the misexpressed gene in the upstream \n",
    "    region between the SV and the misexpressed gene. Input must be subset to upstream SVs for \n",
    "    valid .bed file coordinates. \n",
    "    \n",
    "    Returns True if no active intervening gene and False if there is one. \n",
    "    \"\"\"\n",
    "    region_gene_cols = {0:\"sv_chrom\", 1:\"sv_start\", 2:\"sv_end\", 3:\"name\", 4:\"score\", 5:\"strand\",  \n",
    "                    6: \"gene_chrom\", 7:\"gene_start\", 8:\"gene_end\", 9:\"gene_id\", 10:\"gene_score\", \n",
    "                    11:\"gene_strand\", 12:\"overlap\"\n",
    "                   }\n",
    "    \n",
    "    chrom = row[\"chrom\"]\n",
    "    gene_strand = row[\"misexp_gene_strand\"]\n",
    "    vrnt_gene_id = f'{row[\"vrnt_id\"]}_{row[\"misexp_gene_id\"]}'\n",
    "    if row[\"misexp_gene_strand\"] == \"+\":\n",
    "        region_start = row[\"sv_end\"]\n",
    "        region_end = row[\"misexp_gene_start\"]\n",
    "    elif row[\"misexp_gene_strand\"] == \"-\":\n",
    "        region_start = row[\"misexp_gene_end\"]\n",
    "        region_end = row[\"sv_start\"]   \n",
    "    else: \n",
    "        raise ValueError(\"Strand not recognised.\")\n",
    "    # create .bed file for region and intersect with gene bed file \n",
    "    region_bed = BedTool(f\"{chrom} {region_start} {region_end} {vrnt_gene_id} 0 {gene_strand}\", from_string=True)\n",
    "    # require gene to be completely enclosed in the readthrough region F=1 \n",
    "    region_gene_intersect_str = StringIO(str(region_bed.intersect(gencode_bed, wo=True, F=1)))\n",
    "    # check if string is empty - if empty no intervening genes return False \n",
    "    # if not empty check if intervening gens are active \n",
    "    if region_gene_intersect_str.getvalue().strip():\n",
    "        region_gene_intersect_str.seek(0) # move cursor to beginning\n",
    "        region_gene_intersect_df = pd.read_csv(region_gene_intersect_str, sep=\"\\t\", header=None, dtype={3:str}).rename(columns=region_gene_cols)\n",
    "        # subset to genes on the same strand as the misexpressed gene \n",
    "        region_gene_intersect_shared_strand_df = region_gene_intersect_df[region_gene_intersect_df.gene_strand == gene_strand]\n",
    "        genes_in_region = set(region_gene_intersect_shared_strand_df.gene_id.unique())\n",
    "        if len(genes_in_region.intersection(genes_median_tpm05)) > 0: \n",
    "            return False\n",
    "        else: \n",
    "            return True\n",
    "    else: \n",
    "        return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "fea7fa09",
   "metadata": {},
   "outputs": [],
   "source": [
    "misexp_del_sv_feat_upstream_df[\"no_active_gene\"] = misexp_del_sv_feat_upstream_df.apply(no_active_intervening_gene, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b2ef9f9",
   "metadata": {},
   "source": [
    "**Annotated genes that overlap 3' end of any gene**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "56669fb6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load variants in tested windows \n",
    "vrnt_id_in_window_chr_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_misexp_genes.bed\")\n",
    "vrnt_id_in_window_chr_bed = BedTool(vrnt_id_in_window_chr_path)\n",
    "# intersect SVs and genes \n",
    "sv_overlap_gene_str = StringIO(str(vrnt_id_in_window_chr_bed.intersect(gencode_bed, wo=True)))\n",
    "# read intersect between SVs and genes \n",
    "columns={0: \"sv_chrom\", 1: \"sv_start\", 2: \"sv_end\", 3:\"vrnt_id\", \n",
    "         4:\"gene_chrom\", 5:\"overlap_gene_start\", 6:\"overlap_gene_end\", 7:\"overlap_gene_id\", \n",
    "         8:\"score\", 9:\"overlap_gene_strand\", 10:\"gene_overlap\"\n",
    "        }\n",
    "sv_overlap_gene_df = pd.read_csv(sv_overlap_gene_str, sep=\"\\t\", header=None, dtype={3:str}).rename(columns=columns)\n",
    "# subset to relevant features \n",
    "sv_overlap_gene_feat_df = sv_overlap_gene_df[[\"vrnt_id\", \"overlap_gene_start\", \"overlap_gene_end\", \"overlap_gene_id\", \"overlap_gene_strand\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "bc18cfc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def overlaps_gene_3_prime(row): \n",
    "    if row[\"overlap_gene_strand\"] == \"+\": \n",
    "        return (row[\"sv_start\"] < row[\"overlap_gene_end\"]) & (row[\"sv_start\"] > row[\"overlap_gene_start\"]) & (row[\"sv_end\"] > row[\"overlap_gene_end\"])\n",
    "    elif row[\"overlap_gene_strand\"] == \"-\": \n",
    "        return (row[\"sv_start\"] < row[\"overlap_gene_start\"]) & (row[\"overlap_gene_end\"] > row[\"sv_end\"]) & (row[\"overlap_gene_start\"] < row[\"sv_end\"])\n",
    "    else: \n",
    "        return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "d876e31f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# add overlapping genes to misexpression DELs that are upstream \n",
    "misexp_del_overlap_genes_df = pd.merge(misexp_del_sv_feat_upstream_df, \n",
    "                                       sv_overlap_gene_feat_df, \n",
    "                                       on=\"vrnt_id\", \n",
    "                                       how=\"left\")\n",
    "# annotate genes that overlap the 3' end \n",
    "misexp_del_overlap_genes_df[\"overlap_gene_3_prime\"] = misexp_del_overlap_genes_df.apply(overlaps_gene_3_prime, axis=1)\n",
    "# variant overlaps gene that is expressed in blood \n",
    "misexp_del_overlap_genes_df[\"overlap_gene_expressed\"] = np.where(misexp_del_overlap_genes_df.overlap_gene_id.isin(genes_median_tpm05), True, False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ff78440",
   "metadata": {},
   "source": [
    "**Annotate SVs that overlaps a terminal exon (TE) polyA site in the 3' overlapping gene**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f637ccdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "### polyA site (PAS) overlap \n",
    "\n",
    "# polyA information\n",
    "polya_site_df = pd.read_csv(polya_site_atlas_path, sep=\"\\t\", dtype={0:str}).rename(columns={\"gene_id\": \"gene_id\"})\n",
    "# polyA sites (PAS) bed\n",
    "polya_site_atlas_bed = BedTool(polya_site_atlas_bed_path)\n",
    "# load variants in tested windows \n",
    "vrnt_id_in_window_bed = BedTool(vrnt_id_in_window_path)\n",
    "# intersect with polyA sites \n",
    "sv_overlap_polya_str = StringIO(str(vrnt_id_in_window_bed.intersect(polya_site_atlas_bed, wo=True)))\n",
    "polya_intersect_columns={0: \"sv_chrom\", 1: \"sv_start\", 2: \"sv_end\", 3:\"vrnt_id\", \n",
    "         4:\"polya_chrom\", 5:\"polya_start\", 6:\"polya_end\", 7:\"name\", \n",
    "         8:\"polya_mean_tpm_1\", 9:\"polya_strand\", 10:\"polya_frac\", 11:\"polya_support\", \n",
    "         12:\"polya_mean_tpm_2\",13:\"polya_annotation\", 14:\"polya_signals\", \n",
    "         15:\"overlap\"\n",
    "        }\n",
    "sv_overlap_polya_df = pd.read_csv(sv_overlap_polya_str, sep=\"\\t\", header=None, dtype={3:str}).rename(columns=polya_intersect_columns)\n",
    "# subset to misexpression deletions \n",
    "misexp_dels_polya_df = sv_overlap_polya_df[sv_overlap_polya_df.vrnt_id.isin(misexp_dels)]\n",
    "# add polyA site gene ID \n",
    "polya_site_gene_id_df = polya_site_df[[\"name\", \"gene_id\"]]\n",
    "misexp_del_polya_gene_id_df = pd.merge(misexp_dels_polya_df, polya_site_gene_id_df, on=\"name\", how=\"left\") \n",
    "# subset only to TE (terminal exon) polyA sites \n",
    "misexp_del_polya_te_gene_id_df = misexp_del_polya_gene_id_df[misexp_del_polya_gene_id_df.polya_annotation == \"TE\"]\n",
    "# expand out genes with multiple annotations separated by |\n",
    "misexp_del_overlap_gene_polya_te_df = misexp_del_polya_te_gene_id_df[[\"vrnt_id\", \"gene_id\", \"polya_strand\"]].drop_duplicates()\n",
    "misexp_del_overlap_gene_polya_te_df[\"overlap_gene_id\"] = misexp_del_overlap_gene_polya_te_df.gene_id.str.split(\"|\")\n",
    "misexp_del_overlap_gene_polya_te_df = misexp_del_overlap_gene_polya_te_df.explode(\"overlap_gene_id\", ignore_index=True)\n",
    "# drop gene ID column \n",
    "misexp_del_overlap_gene_polya_te_drop_dups_df = misexp_del_overlap_gene_polya_te_df.drop(columns=[\"gene_id\"]).drop_duplicates()\n",
    "misexp_del_overlap_gene_polya_te_drop_dups_df[\"polya_te_overlap\"] = True "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "bb9497af",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# merge with misexpressed deletion dataframe \n",
    "misexp_del_overlap_genes_polya_df = pd.merge(misexp_del_overlap_genes_df, \n",
    "                                             misexp_del_overlap_gene_polya_te_drop_dups_df, \n",
    "                                             on=[\"vrnt_id\", \"overlap_gene_id\"], \n",
    "                                             how=\"left\")\n",
    "# check dataframe length unchanged \n",
    "len(misexp_del_overlap_genes_polya_df) == len(misexp_del_overlap_genes_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "5453fa51",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of DELs upstream of misexpressed gene: 54\n",
      "Remaining DELs upstream of misexpressed gene with no intervening active gene on same strand: 52\n",
      "Remaining DELs overlapping 3' end of upstream gene: 13\n",
      "Remaining DELs overlap polyA site in upstream gene: 13\n",
      "Upstream gene is expressed: 12\n",
      "Upstream gene is on same strand: 12\n",
      "Final set of candidate tx readthrough DELs: 12\n"
     ]
    }
   ],
   "source": [
    "### Transcriptional readthrough variant prioritisation \n",
    "\n",
    "# number of variants upstream \n",
    "num_upstream_misexp_dels = misexp_del_overlap_genes_polya_df.vrnt_id.unique()\n",
    "print(f\"Number of DELs upstream of misexpressed gene: {len(num_upstream_misexp_dels)}\")\n",
    "# number of upstream DELs with no active gene between SV breakpoint and misexpressed gene \n",
    "filter_1_df = misexp_del_overlap_genes_polya_df[(misexp_del_overlap_genes_polya_df.position == \"Upstream\") & \n",
    "                                                (misexp_del_overlap_genes_polya_df.no_active_gene)]\n",
    "filter_1_vrnt_ids = set(filter_1_df.vrnt_id.unique())\n",
    "print(f\"Remaining DELs upstream of misexpressed gene with no intervening active gene on same strand: {len(filter_1_vrnt_ids)}\")\n",
    "\n",
    "# number of variants overlapping 3' end of a gene and polyA site \n",
    "filter_2_df = filter_1_df[(filter_1_df.overlap_gene_3_prime == True) \n",
    "                         ]\n",
    "filter_2_vrnt_ids = set(filter_2_df.vrnt_id.unique())\n",
    "print(f\"Remaining DELs overlapping 3' end of upstream gene: {len(filter_2_vrnt_ids)}\")\n",
    "\n",
    "# SV overlaps polyA site in the 3' region of the overlapped gene\n",
    "filter_3_df = filter_2_df[filter_2_df.polya_te_overlap == True]\n",
    "filter_3_vrnt_ids = set(filter_3_df.vrnt_id.unique())\n",
    "print(f\"Remaining DELs overlap polyA site in upstream gene: {len(filter_3_vrnt_ids)}\")\n",
    "\n",
    "# upstream gene is expressed\n",
    "filter_4_df = filter_3_df[(filter_3_df.overlap_gene_expressed == True)]\n",
    "filter_4_vrnt_ids = set(filter_4_df.vrnt_id.unique())\n",
    "print(f\"Upstream gene is expressed: {len(filter_4_vrnt_ids)}\")\n",
    "\n",
    "# upstream gene is on same strand \n",
    "filter_5_df = filter_4_df[(filter_4_df.misexp_gene_strand == filter_4_df.overlap_gene_strand)]\n",
    "filter_5_vrnt_ids = set(filter_5_df.vrnt_id.unique())\n",
    "print(f\"Upstream gene is on same strand: {len(filter_5_vrnt_ids)}\")    \n",
    "\n",
    "misexp_tx_read_vrnt_ids = filter_5_vrnt_ids\n",
    "misexp_tx_read_vrnts_df = filter_5_df.reset_index(drop=True)\n",
    "print(f\"Final set of candidate tx readthrough DELs: {len(misexp_tx_read_vrnt_ids)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5036a5b2",
   "metadata": {},
   "source": [
    "**Select the closest gene to the SV if multiple genes are misexpressed**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "923dab5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# N.B. use idxmax as distances are negative for upstream\n",
    "idx_closest_distance = misexp_tx_read_vrnts_df.groupby(\"vrnt_id\").distance.idxmax()\n",
    "misexp_tx_read_vrnts_closest_df = misexp_tx_read_vrnts_df.loc[idx_closest_distance].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "08590e8d",
   "metadata": {},
   "source": [
    "**Metrics of transcriptional readthrough candidate DELs**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "be1c461d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of transcriptional readthrough candidate variant-gene pairs: 12\n",
      "Transcriptional readthrough candidate variants: {'293588', 'DEL_chr2_100567565_100578556', '252546', '179078', '284739', 'DEL_chr12_102008109_102015494', 'DEL_chr17_58705468_58742168', 'DEL_chr8_103436284_103461045', '201247', '123122', '188910', 'DEL_chr3_187069321_187094542'}\n"
     ]
    }
   ],
   "source": [
    "# merge with misexpression variant features for metrics \n",
    "misex_tx_read_vrnt_gene_df = misexp_tx_read_vrnts_closest_df[[\"vrnt_id\", \"misexp_gene_id\", \"overlap_gene_id\", \"distance\"]].drop_duplicates()\n",
    "misex_tx_read_vrnt_gene_df = misex_tx_read_vrnt_gene_df.rename(columns={\"misexp_gene_id\": \"gene_id\"})\n",
    "print(f\"Number of transcriptional readthrough candidate variant-gene pairs: {len(misex_tx_read_vrnt_gene_df)}\")\n",
    "print(f\"Transcriptional readthrough candidate variants: {set(misex_tx_read_vrnt_gene_df.vrnt_id.tolist())}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "502fee1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# add variant features \n",
    "misexp_tx_read_vrnt_feat_df = pd.merge(misex_tx_read_vrnt_gene_df, \n",
    "                                       misexp_vrnt_feat_df, \n",
    "                                       on=[\"vrnt_id\", \"gene_id\"], \n",
    "                                       how=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "33fb71b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to file \n",
    "misexp_tx_read_vrnt_feat_path = out_dir_path.joinpath(\"misexp_del_tx_readthrough_candidates.tsv\")\n",
    "misexp_tx_read_vrnt_feat_df.to_csv(misexp_tx_read_vrnt_feat_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "b4619e10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write variants to list \n",
    "misexp_tx_read_vrnts = misexp_tx_read_vrnt_feat_df.vrnt_id.unique()\n",
    "misexp_tx_read_vrnts_path = out_dir_path.joinpath(\"misexp_del_tx_vrnts.txt\") \n",
    "with open(misexp_tx_read_vrnts_path, 'w') as f_out: \n",
    "    for vrnt_id in misexp_tx_read_vrnts: \n",
    "        f_out.write(f\"{vrnt_id}\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
